#include "opencl_source_map.hpp" 
namespace MNN { 
const char* pooling = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"__kernel void pooling(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n"
" __private const int2 input_shape,__private const int output_height,__private const int2 pad_shape,\n"
" __private const int2 stride_shape,\n"
" __private const int2 kernel_shape,\n"
" __write_only image2d_t output,\n"
" __write_only image2d_t rediceOutput) {\n"
" const int output_channel_idx=get_global_id(0);\n"
" const int output_width_idx=get_global_id(1);\n"
" const int output_batch_height_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(output_channel_idx,output_width_idx,output_batch_height_idx);\n"
" const int output_width=global_size_dim1;\n"
" const int output_batch_idx=output_batch_height_idx/output_height;\n"
" const int output_height_idx=output_batch_height_idx-mul24(output_batch_idx,output_height);\n"
" const int input_start=mul24(output_batch_idx,input_shape.x);\n"
" const int input_height_start=mad24(output_height_idx,stride_shape.x,-pad_shape.x);\n"
" const int input_width_start=mad24(output_width_idx,stride_shape.y,-pad_shape.y);\n"
" const int input_channel_start=mul24(output_channel_idx,input_shape.y);\n"
" #ifdef RETURN_REDICE\n"
" int4 redice=(int4)0;\n"
" #endif\n"
"#ifdef POOL_AVG\n"
" FLOAT4 output_result=0;\n"
" for (int height=0; height<kernel_shape.x; height++) {\n"
" int input_height_idx=input_height_start+height;\n"
" input_height_idx =\n"
" select(input_start+input_height_idx,-1,(input_height_idx<0 || input_height_idx >= input_shape.x));\n"
" for (int width=0; width<kernel_shape.y; width++) {\n"
" int input_width_idx=input_width_start+width;\n"
" input_width_idx =\n"
" select(input_channel_start+input_width_idx,-1,(input_width_idx<0 || input_width_idx >= input_shape.y));\n"
" FLOAT4 input_data=RI_F(input,SAMPLER,(int2)(input_width_idx,input_height_idx));\n"
" output_result=output_result+input_data;\n"
" }\n"
" }\n"
" const int kernel_height_start=max(0,input_height_start);\n"
" const int kernel_width_start=max(0,input_width_start);\n"
" const int kernel_height_end=min(input_height_start+kernel_shape.x,input_shape.x);\n"
" const int kernel_width_end=min(input_width_start+kernel_shape.y,input_shape.y);\n"
" #ifdef COUNT_INCLUDE_PADDING\n"
" const int block_size=(min(input_height_start+kernel_shape.x,input_shape.x+pad_shape.x)-input_height_start)*(min(input_width_start+kernel_shape.y,input_shape.y+pad_shape.y)-input_width_start);\n"
" #else\n"
" const int block_size=mul24((kernel_height_end-kernel_height_start),(kernel_width_end-kernel_width_start));\n"
" #endif\n"
" const FLOAT block_float_req=(FLOAT)1.0f/(FLOAT)block_size;\n"
" output_result=output_result*block_float_req;\n"
"#else\n"
" FLOAT4 output_result=(FLOAT4)(-FLT_MAX);\n"
" for (int height=0; height<kernel_shape.x; height++) {\n"
" int input_height_idx=input_height_start+height;\n"
" input_height_idx =\n"
" select(input_start+input_height_idx,-1,(input_height_idx<0 || input_height_idx >= input_shape.x));\n"
" if (input_height_idx != -1) {\n"
" for (int width=0; width<kernel_shape.y; width++) {\n"
" int input_width_idx=input_width_start+width;\n"
" input_width_idx=select(input_channel_start+input_width_idx,-1,\n"
" (input_width_idx<0 || input_width_idx >= input_shape.y));\n"
" if (input_width_idx != -1) {\n"
" FLOAT4 input_data=RI_F(input,SAMPLER,(int2)(input_width_idx,input_height_idx));\n"
" #ifdef RETURN_REDICE\n"
" redice=input_data>output_result ? (int4)((input_height_start+height)*input_shape.y+input_width_start+width) : redice;\n"
" #endif\n"
" output_result=fmax(output_result,input_data);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#endif\n"
" const int output_channel_width_idx=mad24(output_channel_idx,output_width,output_width_idx);\n"
" WI_F(output,(int2)(output_channel_width_idx,output_batch_height_idx),output_result);\n"
" #ifdef RETURN_REDICE\n"
" WI_F(rediceOutput,(int2)(output_channel_width_idx,output_batch_height_idx),CONVERT_FLOAT4(redice));\n"
" #endif\n"
"}\n"
"#if LOCAL_SIZE>1\n"
"__kernel void global_pooling(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n"
" __private const int2 input_shape,__private const int output_height,__private const int2 pad_shape,\n"
" __private const int2 stride_shape,\n"
" __private const int2 kernel_shape,\n"
" __write_only image2d_t output,\n"
" __write_only image2d_t rediceOutput) {\n"
" const int local_id=get_local_id(0);\n"
" const int output_channel_idx=get_global_id(1);\n"
" const int output_batch_idx=get_global_id(2);\n"
"#ifdef POOL_AVG\n"
" FLOAT4 output_result=0;\n"
"#else\n"
" FLOAT4 output_result=(FLOAT4)(-FLT_MAX);\n"
"#endif\n"
"#ifdef RETURN_REDICE\n"
" int4 redice=(int4)0;\n"
" int4 local rediceId[LOCAL_SIZE];\n"
"#endif\n"
" FLOAT4 local sum_mnn[LOCAL_SIZE];\n"
" int wc=output_channel_idx*input_shape.y;\n"
" int bh=output_batch_idx*input_shape.x;\n"
" for(int i=local_id; i<input_shape.x*input_shape.y; i+=LOCAL_SIZE){\n"
" int w=i % input_shape.y;;\n"
" int h=i/input_shape.y;\n"
" FLOAT4 in=RI_F(input,SAMPLER,(int2)(wc+w,bh+h));\n"
"#ifdef POOL_AVG\n"
" output_result += in;\n"
"#else\n"
" output_result=fmax(output_result,in);\n"
"#ifdef RETURN_REDICE\n"
" redice=in>output_result ? (int4)(i) : redice;\n"
"#endif\n"
"#endif\n"
" }\n"
" \n"
" sum_mnn[local_id]=output_result;\n"
"#ifdef RETURN_REDICE\n"
" rediceId[local_id]=redice;\n"
"#endif\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (local_id<i)\n"
"#ifdef POOL_AVG\n"
" sum_mnn[local_id]=sum_mnn[local_id]+sum_mnn[local_id+i];\n"
"#else\n"
" {\n"
"#ifdef RETURN_REDICE\n"
" rediceId[local_id]=sum_mnn[local_id]>sum_mnn[local_id+i] ? rediceId[local_id] : rediceId[local_id+i];\n"
"#endif\n"
" sum_mnn[local_id]=fmax(sum_mnn[local_id],sum_mnn[local_id+i]);\n"
" }\n"
"#endif\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" output_result=sum_mnn[0];\n"
"#ifdef POOL_AVG\n"
" output_result /= (input_shape.x*input_shape.y);\n"
"#endif\n"
" WI_F(output,(int2)(output_channel_idx,output_batch_idx),output_result);\n"
" #ifdef RETURN_REDICE\n"
" redice=rediceId[0];\n"
" WI_F(rediceOutput,(int2)(output_channel_idx,output_batch_idx),CONVERT_FLOAT4(redice));\n"
" #endif\n"
"}\n"
"#endif\n"
;
}
